Use raw fastq and generate the quality plots to asses the quality of reads
Filter and trim out bad sequences and bases from our sequencing files
Write out fastq files with high quality sequences
Evaluate the quality from our filter and trim.
Infer errors on forward and reverse reads individually
Identified ASVs on forward and reverse reads separately using the error model.
Merge forward and reverse ASVs into “contigous ASVs”.
Generate ASV count table. (otu_table input for
phyloseq.).
ASV count table: otu_table
Sample information: sample_table track the reads
lost throughout DADA2 workflow.
#Set the raw fastq path to the raw sequencing files
#Path to the fastq files
raw_fastqs_path <- "data/01_DADA2/01_raw_gzipped_fastqs/2017"
raw_fastqs_path## [1] "data/01_DADA2/01_raw_gzipped_fastqs/2017"
## [1] "SRR17060816_1.fastq.gz" "SRR17060816_2.fastq.gz" "SRR17060817_1.fastq.gz"
## [4] "SRR17060817_2.fastq.gz" "SRR17060818_1.fastq.gz" "SRR17060818_2.fastq.gz"
## [7] "SRR17060819_1.fastq.gz" "SRR17060819_2.fastq.gz" "SRR17060820_1.fastq.gz"
## [10] "SRR17060820_2.fastq.gz" "SRR17060821_1.fastq.gz" "SRR17060821_2.fastq.gz"
## [13] "SRR17060824_1.fastq.gz" "SRR17060824_2.fastq.gz" "SRR17060830_1.fastq.gz"
## [16] "SRR17060830_2.fastq.gz" "SRR17060831_1.fastq.gz" "SRR17060831_2.fastq.gz"
## [19] "SRR17060832_1.fastq.gz" "SRR17060832_2.fastq.gz" "SRR17060833_1.fastq.gz"
## [22] "SRR17060833_2.fastq.gz" "SRR17060834_1.fastq.gz" "SRR17060834_2.fastq.gz"
## [25] "SRR17060835_1.fastq.gz" "SRR17060835_2.fastq.gz" "SRR17060836_1.fastq.gz"
## [28] "SRR17060836_2.fastq.gz" "SRR17060837_1.fastq.gz" "SRR17060837_2.fastq.gz"
## [31] "SRR17060838_1.fastq.gz" "SRR17060838_2.fastq.gz" "SRR17060846_1.fastq.gz"
## [34] "SRR17060846_2.fastq.gz" "SRR17060847_1.fastq.gz" "SRR17060847_2.fastq.gz"
## chr [1:36] "SRR17060816_1.fastq.gz" "SRR17060816_2.fastq.gz" ...
#Create a vector of forward reads
forward_reads <- list.files(raw_fastqs_path, pattern = "_1.fastq.gz", full.names = TRUE)
#Intuition check
head(forward_reads)## [1] "data/01_DADA2/01_raw_gzipped_fastqs/2017/SRR17060816_1.fastq.gz"
## [2] "data/01_DADA2/01_raw_gzipped_fastqs/2017/SRR17060817_1.fastq.gz"
## [3] "data/01_DADA2/01_raw_gzipped_fastqs/2017/SRR17060818_1.fastq.gz"
## [4] "data/01_DADA2/01_raw_gzipped_fastqs/2017/SRR17060819_1.fastq.gz"
## [5] "data/01_DADA2/01_raw_gzipped_fastqs/2017/SRR17060820_1.fastq.gz"
## [6] "data/01_DADA2/01_raw_gzipped_fastqs/2017/SRR17060821_1.fastq.gz"
#Create a vector of reverse reads
reverse_reads <-list.files(raw_fastqs_path, pattern = "_2.fastq.gz", full.names = TRUE)
#Intuition check
head(reverse_reads)## [1] "data/01_DADA2/01_raw_gzipped_fastqs/2017/SRR17060816_2.fastq.gz"
## [2] "data/01_DADA2/01_raw_gzipped_fastqs/2017/SRR17060817_2.fastq.gz"
## [3] "data/01_DADA2/01_raw_gzipped_fastqs/2017/SRR17060818_2.fastq.gz"
## [4] "data/01_DADA2/01_raw_gzipped_fastqs/2017/SRR17060819_2.fastq.gz"
## [5] "data/01_DADA2/01_raw_gzipped_fastqs/2017/SRR17060820_2.fastq.gz"
## [6] "data/01_DADA2/01_raw_gzipped_fastqs/2017/SRR17060821_2.fastq.gz"
# Randomly select 12 samples from dataset to evaluate
# Selecting 12 is typically better than 2 (like we did in class for efficiency)
random_samples <- sample(1:length(reverse_reads), size = 12)
random_samples## [1] 16 15 1 14 6 17 11 13 4 10 12 18
# Calculate and plot quality of these two samples
forward_filteredQual_plot_12 <- plotQualityProfile(forward_reads[random_samples]) +
labs(title = "Forward Read: Raw Quality")
reverse_filteredQual_plot_12 <- plotQualityProfile(reverse_reads[random_samples]) +
labs(title = "Reverse Read: Raw Quality")
# Plot them together with patchwork
forward_filteredQual_plot_12 + reverse_filteredQual_plot_12# Aggregate all QC plots
# Forward reads
forward_preQC_plot <-
plotQualityProfile(forward_reads, aggregate = TRUE) +
labs(title = "Forward Pre-QC")
# reverse reads
reverse_preQC_plot <-
plotQualityProfile(reverse_reads, aggregate = TRUE) +
labs(title = "Reverse Pre-QC")
preQC_aggregate_plot <-
# Plot the forward and reverse together
forward_preQC_plot + reverse_preQC_plot
# Show the plot
preQC_aggregate_plot# vector of our samples, extract the sample information from our file
samples <-
scan("/local/workdir/cab565/git_repos/Alessandro/data/00_cutadapt/2017_samples.txt",
what = "character")
#Intuition check
head(samples)## [1] "SRR17060816" "SRR17060817" "SRR17060818" "SRR17060819" "SRR17060820"
## [6] "SRR17060821"
#place filtered reads into filtered_fastqs_path
filtered_fastqs_path <- "data/01_DADA2/02b_2017_filtered_fastqs/"
filtered_fastqs_path## [1] "data/01_DADA2/02b_2017_filtered_fastqs/"
# create 2 variables : filtered_F, filtered_R
filtered_forward_reads <-
file.path(filtered_fastqs_path, paste0(samples, "_R1_filtered.fastq.gz"))
#Intuition check
head(filtered_forward_reads)## [1] "data/01_DADA2/02b_2017_filtered_fastqs//SRR17060816_R1_filtered.fastq.gz"
## [2] "data/01_DADA2/02b_2017_filtered_fastqs//SRR17060817_R1_filtered.fastq.gz"
## [3] "data/01_DADA2/02b_2017_filtered_fastqs//SRR17060818_R1_filtered.fastq.gz"
## [4] "data/01_DADA2/02b_2017_filtered_fastqs//SRR17060819_R1_filtered.fastq.gz"
## [5] "data/01_DADA2/02b_2017_filtered_fastqs//SRR17060820_R1_filtered.fastq.gz"
## [6] "data/01_DADA2/02b_2017_filtered_fastqs//SRR17060821_R1_filtered.fastq.gz"
## [1] 18
filtered_reverse_reads <-
file.path(filtered_fastqs_path, paste0(samples, "_R2_filtered.fastq.gz"))
#Intuition check
length(filtered_reverse_reads)## [1] 18
Parameters of filter and trim DEPEND ON THE DATASET
maxN = number of N bases. Remove all Ns from the
data.maxEE = quality filtering threshold applied to expected
errors. By default, all expected errors. Mar recommends using c(1,1).
Here, if there is maxEE expected errors, its okay. If more, throw away
sequence.trimLeft = trim certain number of base pairs on start
of each readtruncQ = truncate reads at the first instance of a
quality score less than or equal to selected number. Chose 2rm.phix = remove phi xcompress = make filtered files .gzippedmultithread = multithread#Assign a vector to filtered reads
#Trim out poor bases
#Write out filtered fastq files
filtered_reads <-
filterAndTrim(fwd = forward_reads, filt = filtered_forward_reads,
rev = reverse_reads, filt.rev = filtered_reverse_reads,
truncLen = c(275,250), trimLeft = c(17,21),
maxN = 0, maxEE = c(1, 1),truncQ = 2, rm.phix = TRUE,
compress = TRUE, multithread = 6)
# Primers are V3-V4
# 341F (5′-CCT ACG GGN GGC WGC AG-3′) (17 bp) from Herlemann et al. 2011
# 785R (5′-GAC TAC HVG GGT ATC TAA TCC-3′)(21 bp) from Herlemann et al. 2011# Plot the 12 random samples after QC
forward_filteredQual_plot_12 <-
plotQualityProfile(filtered_forward_reads[random_samples]) +
labs(title = "Trimmed Forward Read Quality")
reverse_filteredQual_plot_12 <-
plotQualityProfile(filtered_reverse_reads[random_samples]) +
labs(title = "Trimmed Reverse Read Quality")
# Put the two plots together
forward_filteredQual_plot_12 + reverse_filteredQual_plot_12# Aggregate all QC plots
# Forward reads
forward_postQC_plot <-
plotQualityProfile(filtered_forward_reads, aggregate = TRUE) +
labs(title = "Forward Post-QC")
# reverse reads
reverse_postQC_plot <-
plotQualityProfile(filtered_reverse_reads, aggregate = TRUE) +
labs(title = "Reverse Post-QC")
postQC_aggregate_plot <-
# Plot the forward and reverse together
forward_postQC_plot + reverse_postQC_plot
# Show the plot
postQC_aggregate_plotfilterAndTrim## reads.in reads.out
## SRR17060816_1.fastq.gz 285558 64825
## SRR17060817_1.fastq.gz 676817 122659
## SRR17060818_1.fastq.gz 591364 125713
## SRR17060819_1.fastq.gz 379452 88479
## SRR17060820_1.fastq.gz 570270 128505
## SRR17060821_1.fastq.gz 556682 133522
# calculate some stats
filtered_df %>%
reframe(median_reads_in = median(reads.in),
median_reads_out = median(reads.out),
median_percent_retained = (median(reads.out)/median(reads.in)))## median_reads_in median_reads_out median_percent_retained
## 1 455635.5 117175 0.2571683
About 25 percent of reads are retained when maxEE = 1. Quality control plots look good. For the reverse reads, there are more reads with quality scores lower than 30. They occur after 150 bp.
Note every sequencing run needs to be run
separately! The error model MUST be run separately on
each illumina dataset. If you’d like to combine the datasets from
multiple sequencing runs, you’ll need to do the exact same
filterAndTrim() step AND, very importantly, you’ll
need to have the same primer and ASV length expected by the output.
Infer error rates for all possible transitions within purines and pyrimidines (A<>G or C<>T) and transversions between all purine and pyrimidine combinations.
Error model is learned by alternating estimation of the error rates and inference of sample composition until they converge.
## 103632408 total bases in 401676 reads from 4 samples will be used for learning the error rates.
#Plot forward reads errors
forward_error_plot <-
plotErrors(error_forward_reads, nominalQ = TRUE) +
labs(title = "Forward Read Error Model")
#Reverse reads
error_reverse_reads <-
learnErrors(filtered_reverse_reads, multithread = 6)## 121411449 total bases in 530181 reads from 5 samples will be used for learning the error rates.
#Plot reverse reads errors
reverse_error_plot <-
plotErrors(error_reverse_reads, nominalQ = TRUE) +
labs(title = "Reverse Read Error Model")
#Put the two plots together
forward_error_plot + reverse_error_plot## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.
The points do a pretty good job of matching the black lines.
Details of the plot: - Points: The observed error
rates for each consensus quality score.
- Black line: Estimated error rates after convergence
of the machine-learning algorithm.
- Red line: The error rates expected under the nominal
definition of the Q-score.
Similar to what is mentioned in the dada2 tutorial: the estimated error rates (black line) are a “reasonably good” fit to the observed rates (points), and the error rates drop with increased quality as expected. We can now infer ASVs!
An important note: This process occurs separately on forward and reverse reads! This is quite a different approach from how OTUs are identified in Mothur and also from UCHIME, oligotyping, and other OTU, MED, and ASV approaches.
#Infer forward ASVs
dada_forward <- dada(filtered_forward_reads,
err = error_forward_reads,
multithread = 6)## Sample 1 - 64825 reads in 16052 unique sequences.
## Sample 2 - 122659 reads in 7207 unique sequences.
## Sample 3 - 125713 reads in 11422 unique sequences.
## Sample 4 - 88479 reads in 16058 unique sequences.
## Sample 5 - 128505 reads in 12476 unique sequences.
## Sample 6 - 133522 reads in 10879 unique sequences.
## Sample 7 - 87485 reads in 11547 unique sequences.
## Sample 8 - 7184 reads in 1942 unique sequences.
## Sample 9 - 114844 reads in 15535 unique sequences.
## Sample 10 - 64644 reads in 7629 unique sequences.
## Sample 11 - 120738 reads in 16282 unique sequences.
## Sample 12 - 119506 reads in 24424 unique sequences.
## Sample 13 - 124383 reads in 20645 unique sequences.
## Sample 14 - 126386 reads in 9485 unique sequences.
## Sample 15 - 68948 reads in 15339 unique sequences.
## Sample 16 - 127415 reads in 14252 unique sequences.
## Sample 17 - 98952 reads in 16424 unique sequences.
## Sample 18 - 57123 reads in 8619 unique sequences.
#Infer reverse ASVs
dada_reverse <- dada(filtered_reverse_reads,
err = error_reverse_reads,
multithread = 6)## Sample 1 - 64825 reads in 16522 unique sequences.
## Sample 2 - 122659 reads in 16535 unique sequences.
## Sample 3 - 125713 reads in 17289 unique sequences.
## Sample 4 - 88479 reads in 25463 unique sequences.
## Sample 5 - 128505 reads in 19784 unique sequences.
## Sample 6 - 133522 reads in 20225 unique sequences.
## Sample 7 - 87485 reads in 17623 unique sequences.
## Sample 8 - 7184 reads in 2608 unique sequences.
## Sample 9 - 114844 reads in 27041 unique sequences.
## Sample 10 - 64644 reads in 13452 unique sequences.
## Sample 11 - 120738 reads in 22866 unique sequences.
## Sample 12 - 119506 reads in 34239 unique sequences.
## Sample 13 - 124383 reads in 33256 unique sequences.
## Sample 14 - 126386 reads in 18040 unique sequences.
## Sample 15 - 68948 reads in 24569 unique sequences.
## Sample 16 - 127415 reads in 21086 unique sequences.
## Sample 17 - 98952 reads in 24415 unique sequences.
## Sample 18 - 57123 reads in 15247 unique sequences.
## $SRR17060816_R1_filtered.fastq.gz
## dada-class: object describing DADA2 denoising results
## 461 sequence variants were inferred from 16052 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## $SRR17060816_R2_filtered.fastq.gz
## dada-class: object describing DADA2 denoising results
## 326 sequence variants were inferred from 16522 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## $SRR17060834_R1_filtered.fastq.gz
## dada-class: object describing DADA2 denoising results
## 1736 sequence variants were inferred from 24424 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## $SRR17060834_R2_filtered.fastq.gz
## dada-class: object describing DADA2 denoising results
## 1143 sequence variants were inferred from 34239 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
Now, merge the forward and reverse ASVs into contigs.
# merge forward and reverse ASVs
merged_ASVs <- mergePairs(dada_forward, filtered_forward_reads,
dada_reverse, filtered_reverse_reads,
verbose = TRUE)## 61597 paired-reads (in 616 unique pairings) successfully merged out of 63526 (in 1570 pairings) input.
## 120825 paired-reads (in 340 unique pairings) successfully merged out of 121793 (in 666 pairings) input.
## 123854 paired-reads (in 305 unique pairings) successfully merged out of 124930 (in 650 pairings) input.
## 86347 paired-reads (in 636 unique pairings) successfully merged out of 87537 (in 889 pairings) input.
## 126896 paired-reads (in 336 unique pairings) successfully merged out of 127832 (in 537 pairings) input.
## 132053 paired-reads (in 329 unique pairings) successfully merged out of 132889 (in 517 pairings) input.
## 85543 paired-reads (in 380 unique pairings) successfully merged out of 86729 (in 764 pairings) input.
## 6842 paired-reads (in 143 unique pairings) successfully merged out of 7045 (in 170 pairings) input.
## 113686 paired-reads (in 385 unique pairings) successfully merged out of 114554 (in 481 pairings) input.
## 63311 paired-reads (in 367 unique pairings) successfully merged out of 64206 (in 466 pairings) input.
## 118599 paired-reads (in 365 unique pairings) successfully merged out of 119938 (in 612 pairings) input.
## 112894 paired-reads (in 1507 unique pairings) successfully merged out of 116722 (in 2369 pairings) input.
## 121780 paired-reads (in 708 unique pairings) successfully merged out of 123187 (in 1073 pairings) input.
## 125536 paired-reads (in 179 unique pairings) successfully merged out of 126044 (in 270 pairings) input.
## 63829 paired-reads (in 1285 unique pairings) successfully merged out of 66635 (in 1877 pairings) input.
## 125671 paired-reads (in 374 unique pairings) successfully merged out of 126924 (in 635 pairings) input.
## 95893 paired-reads (in 505 unique pairings) successfully merged out of 97746 (in 904 pairings) input.
## 55358 paired-reads (in 258 unique pairings) successfully merged out of 56525 (in 439 pairings) input.
## [1] "list"
## [1] 18
## [1] "SRR17060816_R1_filtered.fastq.gz" "SRR17060817_R1_filtered.fastq.gz"
## [3] "SRR17060818_R1_filtered.fastq.gz" "SRR17060819_R1_filtered.fastq.gz"
## [5] "SRR17060820_R1_filtered.fastq.gz" "SRR17060821_R1_filtered.fastq.gz"
## [7] "SRR17060824_R1_filtered.fastq.gz" "SRR17060830_R1_filtered.fastq.gz"
## [9] "SRR17060831_R1_filtered.fastq.gz" "SRR17060832_R1_filtered.fastq.gz"
## [11] "SRR17060833_R1_filtered.fastq.gz" "SRR17060834_R1_filtered.fastq.gz"
## [13] "SRR17060835_R1_filtered.fastq.gz" "SRR17060836_R1_filtered.fastq.gz"
## [15] "SRR17060837_R1_filtered.fastq.gz" "SRR17060838_R1_filtered.fastq.gz"
## [17] "SRR17060846_R1_filtered.fastq.gz" "SRR17060847_R1_filtered.fastq.gz"
# Create the ASV Count Table
raw_ASV_table <- makeSequenceTable(merged_ASVs)
# Write out the file to data/01_DADA2
# Check the type and dimensions of the data
dim(raw_ASV_table)## [1] 18 5510
## [1] "matrix" "array"
## [1] "integer"
# Inspect the distribution of sequence lengths of all ASVs in dataset
table(nchar(getSequences(raw_ASV_table)))##
## 258 259 260 261 290 300 326 350 355 372 373 374 382 383 384 385
## 12 12 43 1 1 2 1 5 1 2 3 1 3 3 4 1
## 386 389 396 400 401 402 403 404 405 406 407 408 409 410 411 412
## 4 2 1 2 70 607 294 191 513 67 19 11 8 18 4 9
## 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428
## 2 9 4 9 9 8 9 9 65 1304 61 42 19 315 1238 443
## 429 430 431 432 450 451 454 455 460
## 32 7 1 1 3 1 2 1 1
# Inspect the distribution of sequence lengths of all ASVs in dataset
# AFTER TRIM
data.frame(Seq_Length = nchar(getSequences(raw_ASV_table))) %>%
ggplot(aes(x = Seq_Length )) +
geom_histogram() +
labs(title = "Raw distribution of ASV length")## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
###################################################
###################################################
# TRIM THE ASVS
# Let's trim the ASVs to only be the right size, which is 421-429.
# We will allow for a few
raw_ASV_table_trimmed <- raw_ASV_table[,nchar(colnames(raw_ASV_table)) %in% 421:429]
# Inspect the distribution of sequence lengths of all ASVs in dataset
table(nchar(getSequences(raw_ASV_table_trimmed)))##
## 421 422 423 424 425 426 427 428 429
## 65 1304 61 42 19 315 1238 443 32
## [1] 0.8249287
# Inspect the distribution of sequence lengths of all ASVs in dataset
# AFTER TRIM
data.frame(Seq_Length = nchar(getSequences(raw_ASV_table_trimmed))) %>%
ggplot(aes(x = Seq_Length )) +
geom_histogram() +
labs(title = "Trimmed distribution of ASV length")## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Let's zoom in on the plot
data.frame(Seq_Length = nchar(getSequences(raw_ASV_table_trimmed))) %>%
ggplot(aes(x = Seq_Length )) +
geom_histogram() +
labs(title = "Trimmed distribution of ASV length") +
scale_y_continuous(limits = c(0, 500))## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
Taking into account the lower, zoomed-in plot. Do we want to remove those extra ASVs? No
Sometimes chimeras arise in our workflow.
Chimeric sequences are artificial sequences formed by the combination of two or more distinct biological sequences. These chimeric sequences can arise during the polymerase chain reaction (PCR) amplification step of the 16S rRNA gene, where fragments from different templates can be erroneously joined together.
Chimera removal is an essential step in the analysis of 16S sequencing data to improve the accuracy of downstream analyses, such as taxonomic assignment and diversity assessment. It helps to avoid the inclusion of misleading or spurious sequences that could lead to incorrect biological interpretations.
# Remove the chimeras in the raw ASV table
noChimeras_ASV_table <- removeBimeraDenovo(raw_ASV_table_trimmed,
method="consensus",
multithread=6, verbose=TRUE)## Identified 1276 bimeras out of 3519 input sequences.
## [1] 18 2243
## [1] 0.9561805
## [1] 0.7887808
# Plot it
data.frame(Seq_Length_NoChim = nchar(getSequences(noChimeras_ASV_table))) %>%
ggplot(aes(x = Seq_Length_NoChim )) +
geom_histogram()+
labs(title = "Trimmed + Chimera Removal distribution of ASV length")## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Here, we will look at the number of reads that were lost in the filtering, denoising, merging, and chimera removal.
# A little function to identify number seqs
getN <- function(x) sum(getUniques(x))
# Make the table to track the seqs
track <- cbind(filtered_reads,
sapply(dada_forward, getN),
sapply(dada_reverse, getN),
sapply(merged_ASVs, getN),
rowSums(noChimeras_ASV_table))
head(track)## reads.in reads.out
## SRR17060816_1.fastq.gz 285558 64825 63902 64385 61597 47244
## SRR17060817_1.fastq.gz 676817 122659 122122 122253 120825 114882
## SRR17060818_1.fastq.gz 591364 125713 125166 125385 123854 115030
## SRR17060819_1.fastq.gz 379452 88479 87940 87995 86347 51420
## SRR17060820_1.fastq.gz 570270 128505 128060 128209 126896 107865
## SRR17060821_1.fastq.gz 556682 133522 133151 133213 132053 120482
# Update column names to be more informative (most are missing at the moment!)
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nochim")
rownames(track) <- samples
# Generate a dataframe to track the reads through our DADA2 pipeline
track_counts_df <-
track %>%
# make it a dataframe
as.data.frame() %>%
rownames_to_column(var = "names") %>%
mutate(perc_reads_retained = 100 * nochim / input)
# Visualize it in table format
DT::datatable(track_counts_df)# Plot it!
track_counts_df %>%
pivot_longer(input:nochim, names_to = "read_type", values_to = "num_reads") %>%
mutate(read_type = fct_relevel(read_type,
"input", "filtered", "denoisedF", "denoisedR", "merged", "nochim")) %>%
ggplot(aes(x = read_type, y = num_reads, fill = read_type)) +
geom_line(aes(group = names), color = "grey") +
geom_point(shape = 21, size = 3, alpha = 0.8) +
scale_fill_brewer(palette = "Spectral") +
labs(x = "Filtering Step", y = "Number of Sequences") +
theme_bw()Below, we will prepare the following:
ASV_fastas: A fasta file that we can use to build a
tree for phylogenetic analyses (e.g. phylogenetic alpha diversity
metrics or UNIFRAC dissimilarty).########### 2. COUNT TABLE ###############
############## Modify the ASV names and then save a fasta file! ##############
# Give headers more manageable names
# First pull the ASV sequences
asv_seqs <- colnames(noChimeras_ASV_table)
asv_seqs[1:5]## [1] "TGGGGAATATTGGACAATGGGCGAAAGCCTGATCCAGCCATGCCGCGTGTGTGAAGAAGGCCTTCGGGTTGTAAAGCACTTTCAGCGAGGAGGAAAGGGATGTTGCTAATATCAGCATCCTGTGACGTTACTCGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCCTGTTAAGTTGGATGTGAAAGCCCCGGGCTCAACCTGGGAACGGCATCCAAAACTGAGAGGCTCGAGTGCGGAAGAGGAGTGTGGAATTTCCTGTGTAGCGGTGAAATGCGTAGATATAGGAAGGAACACCAGTGGCGAAGGCGACACTCTGGTCTGACACTGACGCTGAGGTACGAAAGCGTGGGGAGCAAACA"
## [2] "CCAAGAATATTCCGCAATGGGGGAAACCCTGACGGAGCGACACTGCGTGAATGATGAAGGCCTTCGGGTTGTAAAGTTCTTTTATAAAGGAAGAATAAGTTGGGTAGGAAATGACCTGATGATGACGGTACTTTATGAATAAGTCCCGGCTAATTACGTGCCAGCAGCCGCGGTAATACGTAAGGGACGAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGTTAATTAGGCTGAGTGTTAAAGACTGGGGCTCAACTCCAGAAGGGCATTCAGAACCGGTTGACTAGAATCTGGTGGAAGGCAACGGAATTTCCCGTGTAGCGGTGGAATGCATAGATATGGGAAGGAACACCAAAGGCGAAGGCAGTTGTCTATGCCAAGATTGACGCTGAGGCGCGAAAGTGTGGGGATCAAATA"
## [3] "CCAAGAATATTCCGCAATGGGGGGAACCCTGACGGAGCGACACTGCGTGAATGAAGAAGGCCTTCGGGTTGTAAAGTTCTTTTATAAAGGAAGAATAAGTTATGTAGGAAATGACATAATGATGACGGTACTTTATGAATAAGTCCCGGCTAATTACGTGCCAGCAGCCGCGGTAATACGTAAGGGACGAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGTTAATTAGGCTGAGTGTTAAAGACTAGGGCTCAACTCTAGAAGGGCATTCAGAACCGGTTGACTAGAATCTGGTGGAAGGCAACGGAATTTCCCGTGTAGCGGTGGAATGCATAGATATGGGAAGGAACACCAAAGGCGAAGGCAGTTGTCTATGCTGAGATTGACGCTGAGGCGCGAAAGTGTGGGGATCAAATA"
## [4] "CCAAGAATATTCCGCAATGGGGGGAACCCTGACGGAGCGACACTGCGTGAATGAAGAAGGCCTTCGGGTTGTAAAGTTCTTTTATAAAGGAAGAATAAGTTAGGTAGGAAATGACCTGATGATGACGGTACTTTATGAATAAGTCCCGGCTAATTACGTGCCAGCAGCCGCGGTAATACGTAAGGGACGAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGTTAATTAGGCTGAGTGTTAAAGACTGGGGCTCAACTTCAGAAGGGCATTCAGAACCGGTTGACTAGAATCTGGTGGAAGGCAACGGAATTTCCCGTGTAGCGGTGGAATGCATAGATATGGGAAGGAACACCAAAGGCGAAGGCAGTTGTCTATGCTAAGATTGACGCTGAGGCGCGAAAGTGTGGGGATCAAATA"
## [5] "TGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTGTGAAGAAGGCCTTCGGGTTGTAAAGCACTTTCAGCAGTGAGGAAGGTAATGTAGTTAATACCTGCATTATTTGACGTTAGCTGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTTAAGCAAGATGTGAAAGCCCCGGGCTTAACCTGGGAATCGCATTTTGAACTGGCAAGCTAGAGTCTTGTAGAGGGGGGTAGAATTTCAGGTGTAGCGGTGAAATGCGTAGAGATCTGAAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAAAGACTGACGCTCAGATGCGAAAGCGTGGGGAGCAAACG"
# make headers for our ASV seq fasta file, which will be our asv names
asv_headers <- vector(dim(noChimeras_ASV_table)[2], mode = "character")
asv_headers[1:5]## [1] "" "" "" "" ""
# loop through vector and fill it in with ASV names
for (i in 1:dim(noChimeras_ASV_table)[2]) {
asv_headers[i] <- paste(">ASV", i, sep = "_")
}
# intitution check
asv_headers[1:5]## [1] ">ASV_1" ">ASV_2" ">ASV_3" ">ASV_4" ">ASV_5"
01_DADA2 filesNow, we will write the files! We will write the following to the
data/01_DADA2/ folder. We will save both as files that
could be submitted as supplements AND as .RData objects for easy loading
into the next steps into R.:
ASV_counts.tsv: ASV count table that has ASV names that
are re-written and shortened headers like ASV_1, ASV_2, etc, which will
match the names in our fasta file below. This will also be saved as
data/01_DADA2/06_2017_analysis/ASV_counts.RData.ASV_counts_withSeqNames.tsv: This is generated with the
data object in this file known as noChimeras_ASV_table. ASV
headers include the entire ASV sequence ~250bps. In addition,
we will save this as a .RData object as
data/01_DADA2/06_2017_analysis/noChimeras_ASV_table.RData
as we will use this data in analysis/02_PreProcessing.Rmd
to assign the taxonomy from the sequence headers.ASVs.fasta: A fasta file output of the ASV names from
ASV_counts.tsv and the sequences from the ASVs in
ASV_counts_withSeqNames.tsv. A fasta file that we can use
to build a tree for phylogenetic analyses (e.g. phylogenetic alpha
diversity metrics or UNIFRAC dissimilarty).ASVs.fasta in
data/02_PreProcessing/ to be used for the taxonomy
classification in the next step in the workflow.track_read_counts.RData: To track how many reads we
lost throughout our workflow that could be used and plotted later. We
will add this to the metadata in
analysis/02_ProProcessing.Rmd.# FIRST, we will save our output as regular files, which will be useful later on.
# Save to regular .tsv file
# Write BOTH the modified and unmodified ASV tables to a file!
# Write count table with ASV numbered names (e.g. ASV_1, ASV_2, etc)
write.table(asv_tab, "data/01_DADA2/Alessandro_2017/ASV_counts.tsv", sep = "\t", quote = FALSE, col.names = NA)
# Write count table with ASV sequence names
write.table(noChimeras_ASV_table, "data/01_DADA2/Alessandro_2017/ASV_counts_withSeqNames.tsv", sep = "\t", quote = FALSE, col.names = NA)
# Write out the fasta file for reference later on for what seq matches what ASV
asv_fasta <- c(rbind(asv_headers, asv_seqs))
# Save to a file!
write(asv_fasta, "data/01_DADA2/Alessandro_2017/ASVs.fasta")
# SECOND, let's save to a RData object
# Each of these files will be used in the analysis/02_Taxonomic_Assignment
# RData objects are for easy loading :)
save(noChimeras_ASV_table, file = "data/01_DADA2/Alessandro_2017/noChimeras_ASV_table.RData")
save(asv_tab, file = "data/01_DADA2/Alessandro_2017/ASV_counts.RData")
# And save the track_counts_df a R object, which we will merge with metadata information in the next step of the analysis in nalysis/02_Taxonomic_Assignment.
save(track_counts_df, file = "data/01_DADA2/Alessandro_2017/track_read_counts.RData")##Session information
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 4.3.2 (2023-10-31)
## os Rocky Linux 9.0 (Blue Onyx)
## system x86_64, linux-gnu
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/New_York
## date 2024-04-25
## pandoc 3.1.1 @ /usr/lib/rstudio-server/bin/quarto/bin/tools/ (via rmarkdown)
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date (UTC) lib source
## abind 1.4-5 2016-07-21 [2] CRAN (R 4.3.2)
## ade4 1.7-22 2023-02-06 [1] CRAN (R 4.3.2)
## ape 5.8 2024-04-11 [1] CRAN (R 4.3.2)
## Biobase 2.62.0 2023-10-24 [2] Bioconductor
## BiocGenerics 0.48.1 2023-11-01 [2] Bioconductor
## BiocParallel 1.36.0 2023-10-24 [2] Bioconductor
## biomformat 1.30.0 2023-10-24 [1] Bioconductor
## Biostrings 2.70.1 2023-10-25 [2] Bioconductor
## bitops 1.0-7 2021-04-24 [2] CRAN (R 4.3.2)
## bslib 0.5.1 2023-08-11 [2] CRAN (R 4.3.2)
## cachem 1.0.8 2023-05-01 [2] CRAN (R 4.3.2)
## callr 3.7.3 2022-11-02 [2] CRAN (R 4.3.2)
## cli 3.6.1 2023-03-23 [2] CRAN (R 4.3.2)
## cluster 2.1.4 2022-08-22 [2] CRAN (R 4.3.2)
## codetools 0.2-19 2023-02-01 [2] CRAN (R 4.3.2)
## colorspace 2.1-0 2023-01-23 [2] CRAN (R 4.3.2)
## crayon 1.5.2 2022-09-29 [2] CRAN (R 4.3.2)
## crosstalk 1.2.0 2021-11-04 [2] CRAN (R 4.3.2)
## dada2 * 1.30.0 2023-10-24 [1] Bioconductor
## data.table 1.14.8 2023-02-17 [2] CRAN (R 4.3.2)
## DelayedArray 0.28.0 2023-10-24 [2] Bioconductor
## deldir 1.0-9 2023-05-17 [2] CRAN (R 4.3.2)
## devtools * 2.4.4 2022-07-20 [2] CRAN (R 4.2.1)
## digest 0.6.33 2023-07-07 [2] CRAN (R 4.3.2)
## dplyr * 1.1.3 2023-09-03 [2] CRAN (R 4.3.2)
## DT * 0.32 2024-02-19 [1] CRAN (R 4.3.2)
## ellipsis 0.3.2 2021-04-29 [2] CRAN (R 4.3.2)
## evaluate 0.23 2023-11-01 [2] CRAN (R 4.3.2)
## fansi 1.0.5 2023-10-08 [2] CRAN (R 4.3.2)
## farver 2.1.1 2022-07-06 [2] CRAN (R 4.3.2)
## fastmap 1.1.1 2023-02-24 [2] CRAN (R 4.3.2)
## forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.3.2)
## foreach 1.5.2 2022-02-02 [2] CRAN (R 4.3.2)
## fs 1.6.3 2023-07-20 [2] CRAN (R 4.3.2)
## generics 0.1.3 2022-07-05 [2] CRAN (R 4.3.2)
## GenomeInfoDb 1.38.0 2023-10-24 [2] Bioconductor
## GenomeInfoDbData 1.2.11 2023-11-07 [2] Bioconductor
## GenomicAlignments 1.38.0 2023-10-24 [2] Bioconductor
## GenomicRanges 1.54.1 2023-10-29 [2] Bioconductor
## ggplot2 * 3.5.0 2024-02-23 [2] CRAN (R 4.3.2)
## glue 1.6.2 2022-02-24 [2] CRAN (R 4.3.2)
## gtable 0.3.4 2023-08-21 [2] CRAN (R 4.3.2)
## highr 0.10 2022-12-22 [2] CRAN (R 4.3.2)
## hms 1.1.3 2023-03-21 [1] CRAN (R 4.3.2)
## htmltools 0.5.7 2023-11-03 [2] CRAN (R 4.3.2)
## htmlwidgets 1.6.2 2023-03-17 [2] CRAN (R 4.3.2)
## httpuv 1.6.12 2023-10-23 [2] CRAN (R 4.3.2)
## hwriter 1.3.2.1 2022-04-08 [1] CRAN (R 4.3.2)
## igraph 1.5.1 2023-08-10 [2] CRAN (R 4.3.2)
## interp 1.1-6 2024-01-26 [1] CRAN (R 4.3.2)
## IRanges 2.36.0 2023-10-24 [2] Bioconductor
## iterators 1.0.14 2022-02-05 [2] CRAN (R 4.3.2)
## jpeg 0.1-10 2022-11-29 [1] CRAN (R 4.3.2)
## jquerylib 0.1.4 2021-04-26 [2] CRAN (R 4.3.2)
## jsonlite 1.8.7 2023-06-29 [2] CRAN (R 4.3.2)
## knitr 1.45 2023-10-30 [2] CRAN (R 4.3.2)
## labeling 0.4.3 2023-08-29 [2] CRAN (R 4.3.2)
## later 1.3.1 2023-05-02 [2] CRAN (R 4.3.2)
## lattice 0.21-9 2023-10-01 [2] CRAN (R 4.3.2)
## latticeExtra 0.6-30 2022-07-04 [1] CRAN (R 4.3.2)
## lifecycle 1.0.3 2022-10-07 [2] CRAN (R 4.3.2)
## lubridate * 1.9.3 2023-09-27 [1] CRAN (R 4.3.2)
## magrittr 2.0.3 2022-03-30 [2] CRAN (R 4.3.2)
## MASS 7.3-60 2023-05-04 [2] CRAN (R 4.3.2)
## Matrix 1.6-1.1 2023-09-18 [2] CRAN (R 4.3.2)
## MatrixGenerics 1.14.0 2023-10-24 [2] Bioconductor
## matrixStats 1.1.0 2023-11-07 [2] CRAN (R 4.3.2)
## memoise 2.0.1 2021-11-26 [2] CRAN (R 4.3.2)
## mgcv 1.9-0 2023-07-11 [2] CRAN (R 4.3.2)
## mime 0.12 2021-09-28 [2] CRAN (R 4.3.2)
## miniUI 0.1.1.1 2018-05-18 [2] CRAN (R 4.3.2)
## multtest 2.58.0 2023-10-24 [1] Bioconductor
## munsell 0.5.0 2018-06-12 [2] CRAN (R 4.3.2)
## nlme 3.1-163 2023-08-09 [2] CRAN (R 4.3.2)
## pacman 0.5.1 2019-03-11 [1] CRAN (R 4.3.2)
## patchwork * 1.2.0.9000 2024-03-12 [1] Github (thomasp85/patchwork@d943757)
## permute 0.9-7 2022-01-27 [1] CRAN (R 4.3.2)
## phyloseq * 1.41.1 2024-03-09 [1] Github (joey711/phyloseq@c260561)
## pillar 1.9.0 2023-03-22 [2] CRAN (R 4.3.2)
## pkgbuild 1.4.2 2023-06-26 [2] CRAN (R 4.3.2)
## pkgconfig 2.0.3 2019-09-22 [2] CRAN (R 4.3.2)
## pkgload 1.3.3 2023-09-22 [2] CRAN (R 4.3.2)
## plyr 1.8.9 2023-10-02 [2] CRAN (R 4.3.2)
## png 0.1-8 2022-11-29 [2] CRAN (R 4.3.2)
## prettyunits 1.2.0 2023-09-24 [2] CRAN (R 4.3.2)
## processx 3.8.2 2023-06-30 [2] CRAN (R 4.3.2)
## profvis 0.3.8 2023-05-02 [2] CRAN (R 4.3.2)
## promises 1.2.1 2023-08-10 [2] CRAN (R 4.3.2)
## ps 1.7.5 2023-04-18 [2] CRAN (R 4.3.2)
## purrr * 1.0.2 2023-08-10 [2] CRAN (R 4.3.2)
## R6 2.5.1 2021-08-19 [2] CRAN (R 4.3.2)
## RColorBrewer 1.1-3 2022-04-03 [2] CRAN (R 4.3.2)
## Rcpp * 1.0.11 2023-07-06 [2] CRAN (R 4.3.2)
## RcppParallel 5.1.7 2023-02-27 [2] CRAN (R 4.3.2)
## RCurl 1.98-1.13 2023-11-02 [2] CRAN (R 4.3.2)
## readr * 2.1.5 2024-01-10 [1] CRAN (R 4.3.2)
## remotes 2.4.2.1 2023-07-18 [2] CRAN (R 4.3.2)
## reshape2 1.4.4 2020-04-09 [2] CRAN (R 4.3.2)
## rhdf5 2.46.1 2023-11-29 [1] Bioconductor 3.18 (R 4.3.2)
## rhdf5filters 1.14.1 2023-11-06 [1] Bioconductor
## Rhdf5lib 1.24.2 2024-02-07 [1] Bioconductor 3.18 (R 4.3.2)
## rlang 1.1.2 2023-11-04 [2] CRAN (R 4.3.2)
## rmarkdown 2.25 2023-09-18 [2] CRAN (R 4.3.2)
## Rsamtools 2.18.0 2023-10-24 [2] Bioconductor
## rstudioapi 0.15.0 2023-07-07 [2] CRAN (R 4.3.2)
## S4Arrays 1.2.0 2023-10-24 [2] Bioconductor
## S4Vectors 0.40.1 2023-10-26 [2] Bioconductor
## sass 0.4.7 2023-07-15 [2] CRAN (R 4.3.2)
## scales 1.3.0 2023-11-28 [2] CRAN (R 4.3.2)
## sessioninfo 1.2.2 2021-12-06 [2] CRAN (R 4.3.2)
## shiny 1.7.5.1 2023-10-14 [2] CRAN (R 4.3.2)
## ShortRead 1.60.0 2023-10-24 [1] Bioconductor
## SparseArray 1.2.1 2023-11-05 [2] Bioconductor
## stringi 1.7.12 2023-01-11 [2] CRAN (R 4.3.2)
## stringr * 1.5.0 2022-12-02 [2] CRAN (R 4.3.2)
## SummarizedExperiment 1.32.0 2023-10-24 [2] Bioconductor
## survival 3.5-7 2023-08-14 [2] CRAN (R 4.3.2)
## tibble * 3.2.1 2023-03-20 [2] CRAN (R 4.3.2)
## tidyr * 1.3.0 2023-01-24 [2] CRAN (R 4.3.2)
## tidyselect 1.2.0 2022-10-10 [2] CRAN (R 4.3.2)
## tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.3.2)
## timechange 0.3.0 2024-01-18 [1] CRAN (R 4.3.2)
## tzdb 0.4.0 2023-05-12 [1] CRAN (R 4.3.2)
## urlchecker 1.0.1 2021-11-30 [2] CRAN (R 4.3.2)
## usethis * 2.2.2 2023-07-06 [2] CRAN (R 4.3.2)
## utf8 1.2.4 2023-10-22 [2] CRAN (R 4.3.2)
## vctrs 0.6.4 2023-10-12 [2] CRAN (R 4.3.2)
## vegan 2.6-4 2022-10-11 [1] CRAN (R 4.3.2)
## withr 2.5.2 2023-10-30 [2] CRAN (R 4.3.2)
## xfun 0.41 2023-11-01 [2] CRAN (R 4.3.2)
## xtable 1.8-4 2019-04-21 [2] CRAN (R 4.3.2)
## XVector 0.42.0 2023-10-24 [2] Bioconductor
## yaml 2.3.7 2023-01-23 [2] CRAN (R 4.3.2)
## zlibbioc 1.48.0 2023-10-24 [2] Bioconductor
##
## [1] /home/cab565/R/x86_64-pc-linux-gnu-library/4.3
## [2] /programs/R-4.3.2/library
##
## ──────────────────────────────────────────────────────────────────────────────